1 Load data

Load the cleaned data from data_preparation.rmd. Assumes koi_data.Rda contains the necessary columns.

data_file_path <- "data/Rdas/koi_data.Rda"
koi_data_raw <- readRDS(data_file_path)

2 Logistic regression models

2.1 Select columns

Define target and predictor columns. Review this list carefully.

target_variable_name <- "koi_pdisposition"
predictor_cols <- c(
  "koi_period", "koi_duration", "koi_depth", "koi_prad", "koi_teq",
  "koi_insol", "koi_model_snr", "koi_steff", "koi_slogg", "koi_srad",
  "koi_smass", "koi_impact", "koi_ror", "koi_srho", "koi_sma", "koi_incl",
  "koi_dor", "koi_ldm_coeff1", "koi_ldm_coeff2", "koi_smet"
)
selected_cols <- c(target_variable_name, predictor_cols)

# Check which selected columns actually exist in the loaded data
selected_cols_exist <- selected_cols[selected_cols %in% names(koi_data_raw)]
missing_cols <- setdiff(selected_cols, selected_cols_exist)
if (length(missing_cols) > 0) {
  print(paste("Warning: The following selected columns were not found:", paste(missing_cols, collapse = ", ")))
}
if (!target_variable_name %in% selected_cols_exist) {
  stop(paste("Target variable", target_variable_name, "not found in the data!"))
}

# Subset the data
koi_data <- koi_data_raw[, selected_cols_exist]
print(paste("Selected", ncol(koi_data), "columns for analysis."))
## [1] "Selected 21 columns for analysis."

2.2 Split data into training and test sets

Stratified split based on the target variable.

set.seed(42) # for reproducibility
train_indices <- createDataPartition(koi_data[[target_variable_name]], p = 0.8, list = FALSE)
train_data <- koi_data[train_indices, ]
test_data <- koi_data[-train_indices, ]
print(paste("Training set size:", nrow(train_data)))
## [1] "Training set size: 6399"
print(paste("Test set size:", nrow(test_data)))
## [1] "Test set size: 1599"

2.3 Handle Missing Values (NA) - AFTER Splitting

Apply simple median/mode imputation. Consider more advanced methods if NAs are numerous or potentially non-random.

# --- Impute Training Data ---
numeric_predictors_train <- names(train_data)[sapply(train_data, is.numeric)]
factor_predictors_train <- names(train_data)[sapply(train_data, function(x) is.character(x) || is.factor(x))]
factor_predictors_train <- setdiff(factor_predictors_train, target_variable_name)

# Impute numeric with median (calculated from training data)
train_medians <- list()
for (col in numeric_predictors_train) {
  if (any(is.na(train_data[[col]]))) {
    median_val <- median(train_data[[col]], na.rm = TRUE)
    if (is.na(median_val)) {
      median_val <- 0
      print(paste("Warning: Train median NA for", col))
    }
    train_data[[col]][is.na(train_data[[col]])] <- median_val
    train_medians[[col]] <- median_val # Store median for applying to test set
  }
}

# Impute character/factor with mode (calculated from training data)
train_modes <- list()
for (col in factor_predictors_train) {
  if (any(is.na(train_data[[col]]))) {
    mode_val <- calculate_mode(train_data[[col]], na.rm = TRUE)
    if (is.na(mode_val)) {
      mode_val <- "Missing"
      print(paste("Warning: Train mode NA for", col))
    }
    train_data[[col]][is.na(train_data[[col]])] <- mode_val
    train_modes[[col]] <- mode_val # Store mode for applying to test set
  }
}

# Remove rows with NA in target (TRAIN)
rows_before_na_target_train <- nrow(train_data)
train_data <- train_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(train_data) < rows_before_na_target_train) print("Removed rows with NA target in TRAIN")

# --- Impute Test Data (using values from training data) ---
for (col in names(train_medians)) {
  if (col %in% names(test_data) && any(is.na(test_data[[col]]))) {
    test_data[[col]][is.na(test_data[[col]])] <- train_medians[[col]]
  }
}

# Impute character/factor with *training* mode
for (col in names(train_modes)) {
  if (col %in% names(test_data) && any(is.na(test_data[[col]]))) {
    test_data[[col]][is.na(test_data[[col]])] <- train_modes[[col]]
  }
}
# Handle any remaining NAs in test set predictors if medians/modes couldn't be calculated or applied
na_check_test <- colSums(is.na(test_data %>% select(-all_of(target_variable_name))))
if (any(na_check_test > 0)) {
  print("Warning: NAs still present in test set predictors after imputation:")
  print(na_check_test[na_check_test > 0])
  test_data <- test_data[complete.cases(test_data %>% select(-all_of(target_variable_name))), ]
  print("Removed test rows with remaining NAs in predictors.")
}

# Remove rows with NA in target (TEST)
rows_before_na_target_test <- nrow(test_data)
test_data <- test_data %>% filter(!is.na(.data[[target_variable_name]]))
if (nrow(test_data) < rows_before_na_target_test) print("Removed rows with NA target in TEST")

2.4 Convert Predictors and Target to Factors

Ensure categorical predictors and the target variable are factors with consistent levels.

# Convert target variable to factor with desired levels and labels
train_data[[target_variable_name]] <- factor(train_data[[target_variable_name]],
  levels = c("CANDIDATE", "FALSE POSITIVE"),
  labels = c("candidate", "false_positive")
)
test_data[[target_variable_name]] <- factor(test_data[[target_variable_name]],
  levels = c("CANDIDATE", "FALSE POSITIVE"),
  labels = c("candidate", "false_positive")
)
print("Converted target variable to factor with levels:")
## [1] "Converted target variable to factor with levels:"
print(levels(train_data[[target_variable_name]]))
## [1] "candidate"      "false_positive"
# Define positive class based on the new factor levels
positive_class <- levels(train_data[[target_variable_name]])[2]
if (is.na(positive_class)) stop("Could not determine positive class level after factoring.")
print(paste("Positive class for evaluation:", positive_class))
## [1] "Positive class for evaluation: false_positive"

2.5 Scale Numerical Predictors

Scale numeric predictors AFTER splitting and handling NAs/factors. Fit scaler only on training data.

numeric_predictors_final <- names(train_data)[sapply(train_data, is.numeric)]

# Check for zero variance columns before scaling
nzv <- nearZeroVar(train_data[, numeric_predictors_final], saveMetrics = TRUE)
cols_to_scale <- rownames(nzv)[!nzv$zeroVar]
if (length(cols_to_scale) < length(numeric_predictors_final)) {
  print("Warning: Found zero-variance numeric columns, excluding from scaling:")
  print(rownames(nzv)[nzv$zeroVar])
}

if (length(cols_to_scale) > 0) {
  scaler <- preProcess(train_data[, cols_to_scale], method = c("center", "scale"))
  train_data_scaled <- predict(scaler, train_data)
  test_data_scaled <- predict(scaler, test_data)
  print(paste("Scaled numeric predictors:", paste(cols_to_scale, collapse = ", ")))
} else {
  print("No non-zero variance numeric predictors found to scale.")
  train_data_scaled <- train_data
  test_data_scaled <- test_data
}
## [1] "Scaled numeric predictors: koi_period, koi_duration, koi_depth, koi_prad, koi_teq, koi_insol, koi_model_snr, koi_steff, koi_slogg, koi_srad, koi_smass, koi_impact, koi_ror, koi_srho, koi_sma, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, koi_smet"

2.6 Model Fitting

Define the formula and fit the GLM.

glm_original_formula <- as.formula(paste(target_variable_name, "~ ."))
print(paste("Using formula:", deparse(glm_original_formula)))
## [1] "Using formula: koi_pdisposition ~ ."
glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)

summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.09986    0.09556  11.510  < 2e-16 ***
## koi_period      0.25772    0.21190   1.216  0.22390    
## koi_duration    1.15093    0.09986  11.526  < 2e-16 ***
## koi_depth       2.19920    0.40141   5.479 4.28e-08 ***
## koi_prad        0.25948    0.38662   0.671  0.50212    
## koi_teq         1.82665    0.11695  15.619  < 2e-16 ***
## koi_insol      -0.44363    0.05698  -7.786 6.90e-15 ***
## koi_model_snr  -0.04758    0.08904  -0.534  0.59313    
## koi_steff      -0.48874    0.15495  -3.154  0.00161 ** 
## koi_slogg       0.67690    0.08798   7.694 1.43e-14 ***
## koi_srad        0.01393    0.07724   0.180  0.85685    
## koi_smass       0.44922    0.09916   4.530 5.89e-06 ***
## koi_impact      0.53968    0.10365   5.207 1.92e-07 ***
## koi_ror         2.55320    0.49385   5.170 2.34e-07 ***
## koi_srho        0.02435    0.03015   0.808  0.41923    
## koi_sma        -0.29034    0.24926  -1.165  0.24411    
## koi_incl       -0.41634    0.09885  -4.212 2.53e-05 ***
## koi_dor         0.58369    0.08070   7.233 4.74e-13 ***
## koi_ldm_coeff1 -1.05599    0.46039  -2.294  0.02181 *  
## koi_ldm_coeff2 -0.98381    0.36463  -2.698  0.00697 ** 
## koi_smet       -0.56487    0.05652  -9.995  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8870.4  on 6398  degrees of freedom
## Residual deviance: 5199.5  on 6378  degrees of freedom
## AIC: 5241.5
## 
## Number of Fisher Scoring iterations: 9

2.7 Model interpretation

Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE”:

  • koi_duration (1.15093 ***): Longer transit durations are associated with a higher chance of being a “false_positive”.
    This might seem counterintuitive, as real planets have specific durations. However, some types of false positives (e.g., grazing eclipsing binaries, certain stellar variability) might also produce long-duration events that get flagged.

  • koi_depth (2.19920 ***): Deeper transits are associated with a higher chance of being a “false_positive”.
    This is also potentially counterintuitive, as deep transits are strong signals. However, very deep events might be more likely to be stellar eclipsing binaries (another star, not a planet) rather than a planet, especially if koi_ror is also large.

  • koi_teq (1.82665 ***): Higher equilibrium temperatures are associated with a higher chance of being a “false_positive”.
    This could mean that objects with extremely high calculated T_eq are more likely to be misclassified astrophysical phenomena or certain types of eclipsing binaries rather than stable planetary candidates.

  • koi_slogg (0.67690 ***): Higher stellar surface gravity (log₁₀(cm s⁻²)) is associated with a higher chance of being a “false_positive”.
    Stars with very high surface gravity are typically compact (e.g., some types of dwarfs or even white dwarfs if they were in the sample). Transits around these might have different characteristics or be mimicked by phenomena that are ultimately classified as false positives. Alternatively, if true candidates are mostly around typical main-sequence stars, deviations from that slogg might increase FP likelihood.

  • koi_smass (0.44922 ***): Higher stellar mass is associated with a higher chance of being a “false_positive”.
    Similar to koi_slogg, perhaps transits around more massive stars in dataset are more prone to being astrophysical false positives or are harder to distinguish.

  • koi_impact (0.53968 ***): A higher impact parameter is associated with a higher chance of being a “false_positive”.
    This makes perfect sense, a higher impact parameter means that the transit is more grazing. Grazing transits often have a V-shape (rather than a U-shape) and are a common characteristic of eclipsing binaries or other false positive scenarios.

  • koi_ror (2.55320 ***): A larger planet-to-star radius ratio is associated with a higher chance of being a “false_positive”.
    This is a very strong indicator. While a larger planet is easier to detect, a very large koi_ror often points towards an eclipsing binary system (two stars orbiting each other) rather than a planet transiting a star, as planets are generally much smaller than their host stars. This is a classic false positive signature.

  • koi_dor (0.58369 ***): A larger planet-star distance over star radius (distance at mid-transit normalized by stellar radius) is associated with a higher chance of being a “false_positive”.
    koi_dor is related to the impact parameter and how far from the star’s center the transit occurs, scaled by the star’s radius. A larger value could indicate a more grazing transit geometry, consistent with koi_impact also pointing towards “false_positive”.

Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE” (i.e., INCREASED Likelihood of being a “CANDIDATE”):

  • koi_insol (-0.44363 ***): Higher insolation flux is associated with a lower chance of being a “false_positive” (i.e., more likely a “candidate”).
    This suggests that objects receiving more stellar flux (often closer planets) are more likely to be genuine candidates in the dataset once other factors are taken into account.

  • koi_steff (-0.48874 **): Higher stellar effective temperature is associated with a lower chance of being a “false_positive” (i.e., more likely a “candidate”).
    Transits around hotter stars might be cleaner or less prone to certain types of astrophysical mimics that get flagged as false positives when occurring around cooler stars (which can have more stellar activity).

  • koi_incl (-0.41634 ***): Higher inclination is associated with a lower chance of being a “false_positive” (i.e., more likely a “candidate”).
    This is right. An inclination closer to 90 degrees (edge-on) is required for a transit. If “higher” koi_incl means closer to 90 degrees, then a more edge-on system is less likely to be a false positive and more likely a true candidate. The model is correctly identifying that good, central (or near-central) transits are more likely to be candidates.

  • koi_ldm_coeff1 (-1.05599 *), koi_ldm_coeff2 (-0.98381 **): Specific trends in these limb darkening coefficients are associated with a lower chance of being a “false_positive”.
    This implies that transit light curve shapes that are well-fit with these particular limb darkening characteristics are more typical of genuine planetary candidates than false positives.

  • koi_smet (-0.56487 ***): Higher stellar metallicity is associated with a lower chance of being a “false_positive” (i.e., more likely a “candidate”).
    This aligns very well with astrophysical understanding since stars with higher metallicity are known to be more likely to host planets (especially gas giants). So, the model finds that higher metallicity makes an object more likely to be a genuine candidate.

2.8 Model Performance

Predict on the test set and evaluate.

original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Ensure probabilities are valid
original_probs <- pmax(pmin(original_probs, 1 - 1e-15), 1e-15)

# Convert probabilities to class predictions using a 0.5 threshold
threshold <- 0.5
# Use the levels from the *test* data target factor for creating prediction factor
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] 

original_preds <- factor(ifelse(original_probs > threshold, positive_class, negative_class),
  levels = test_target_levels
)

test_target_factor <- test_data_scaled[[target_variable_name]]

cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
) # Ensure positive class is correctly specified
cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            708            198
##   false_positive        98            595
##                                          
##                Accuracy : 0.8149         
##                  95% CI : (0.795, 0.8336)
##     No Information Rate : 0.5041         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6294         
##                                          
##  Mcnemar's Test P-Value : 8.702e-09      
##                                          
##             Sensitivity : 0.7503         
##             Specificity : 0.8784         
##          Pos Pred Value : 0.8586         
##          Neg Pred Value : 0.7815         
##              Prevalence : 0.4959         
##          Detection Rate : 0.3721         
##    Detection Prevalence : 0.4334         
##       Balanced Accuracy : 0.8144         
##                                          
##        'Positive' Class : false_positive 
## 

Plot confusion matrix

cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))
## [1] "GLM Original Features - AUC: 0.8882"

Plot ROC curve

plot(roc_original,
  main = "ROC Curve - GLM Original Features",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

2.9 Outlier detection

Check for influential points using cook’s distance.

influenceIndexPlot(glm_original_model, vars = "C")

Find and remove all influential points with Cook’s distance > 0.5

cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))
## [1] "Number of influential points to remove: 2"
# Remove all influential points
if (length(outlier_indices) > 0) {
  train_data_scaled <- train_data_scaled[-outlier_indices, ]
  print(paste("Removed", length(outlier_indices), "influential points"))
}
## [1] "Removed 2 influential points"

Refit the GLM with the updated training data.

glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.14156    0.08507  13.419  < 2e-16 ***
## koi_period      0.29864    0.21669   1.378  0.16814    
## koi_duration    1.29810    0.10383  12.502  < 2e-16 ***
## koi_depth       0.77960    0.34790   2.241  0.02503 *  
## koi_prad        0.01257    0.24641   0.051  0.95933    
## koi_teq         1.99661    0.12031  16.595  < 2e-16 ***
## koi_insol      -0.83265    0.09432  -8.828  < 2e-16 ***
## koi_model_snr  -0.08312    0.08217  -1.012  0.31171    
## koi_steff      -0.51090    0.15898  -3.214  0.00131 ** 
## koi_slogg       0.75842    0.08950   8.474  < 2e-16 ***
## koi_srad        0.08568    0.07305   1.173  0.24087    
## koi_smass       0.51646    0.10212   5.057 4.25e-07 ***
## koi_impact      0.40769    0.10601   3.846  0.00012 ***
## koi_ror         5.73441    0.50313  11.398  < 2e-16 ***
## koi_srho        0.02551    0.03030   0.842  0.39986    
## koi_sma        -0.40431    0.25554  -1.582  0.11361    
## koi_incl       -0.25444    0.10093  -2.521  0.01170 *  
## koi_dor         0.65097    0.08326   7.819 5.35e-15 ***
## koi_ldm_coeff1 -1.01362    0.47104  -2.152  0.03141 *  
## koi_ldm_coeff2 -0.95850    0.37285  -2.571  0.01015 *  
## koi_smet       -0.57885    0.05791  -9.996  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8867.7  on 6396  degrees of freedom
## Residual deviance: 5051.9  on 6376  degrees of freedom
## AIC: 5093.9
## 
## Number of Fisher Scoring iterations: 8

Check outliers again.

influenceIndexPlot(glm_original_model, vars = "C")

Find and remove all influential points with Cook’s distance > 0.5

cooks_d <- cooks.distance(glm_original_model)
outlier_indices <- which(cooks_d > 0.5)
print(paste("Number of influential points to remove:", length(outlier_indices)))
## [1] "Number of influential points to remove: 1"
# Remove all influential points
if (length(outlier_indices) > 0) {
  train_data_scaled <- train_data_scaled[-outlier_indices, ]
  print(paste("Removed", length(outlier_indices), "influential points"))
}
## [1] "Removed 1 influential points"

Refit the GLM with the updated training data.

glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.13220    0.08512  13.301  < 2e-16 ***
## koi_period      0.27949    0.21694   1.288  0.19763    
## koi_duration    1.30688    0.10405  12.561  < 2e-16 ***
## koi_depth       0.77501    0.34777   2.229  0.02585 *  
## koi_prad        0.01662    0.26552   0.063  0.95008    
## koi_teq         2.02544    0.12105  16.732  < 2e-16 ***
## koi_insol      -1.15821    0.12615  -9.181  < 2e-16 ***
## koi_model_snr  -0.08300    0.08220  -1.010  0.31262    
## koi_steff      -0.51294    0.15913  -3.223  0.00127 ** 
## koi_slogg       0.76720    0.08972   8.551  < 2e-16 ***
## koi_srad        0.09537    0.07306   1.305  0.19175    
## koi_smass       0.51567    0.10259   5.026 5.00e-07 ***
## koi_impact      0.41457    0.10607   3.908 9.29e-05 ***
## koi_ror         5.74178    0.51005  11.257  < 2e-16 ***
## koi_srho        0.02530    0.03031   0.835  0.40399    
## koi_sma        -0.38305    0.25592  -1.497  0.13446    
## koi_incl       -0.24436    0.10075  -2.426  0.01529 *  
## koi_dor         0.65519    0.08338   7.858 3.91e-15 ***
## koi_ldm_coeff1 -1.00526    0.47154  -2.132  0.03302 *  
## koi_ldm_coeff2 -0.95198    0.37324  -2.551  0.01075 *  
## koi_smet       -0.58088    0.05801 -10.013  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8866.2  on 6395  degrees of freedom
## Residual deviance: 5045.5  on 6375  degrees of freedom
## AIC: 5087.5
## 
## Number of Fisher Scoring iterations: 8

Check outliers again.

influenceIndexPlot(glm_original_model, vars = "C")

Now the model should be good.

2.10 Model Evaluation without outliers

Predict on the test set and evaluate.

original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Ensure probabilities are valid
original_probs <- pmax(pmin(original_probs, 1 - 1e-15), 1e-15)

# Convert probabilities to class predictions using a 0.5 threshold
threshold <- 0.5
# Use the levels from the *test* data target factor for creating prediction factor
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # Should be 'candidate'
# Note: positive_class was defined earlier based on train_data levels

original_preds <- factor(ifelse(original_probs > threshold, positive_class, negative_class),
  levels = test_target_levels
)

test_target_factor <- test_data_scaled[[target_variable_name]]

cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
) # Ensure positive class is correctly specified
cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            708            186
##   false_positive        98            607
##                                           
##                Accuracy : 0.8224          
##                  95% CI : (0.8028, 0.8408)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6444          
##                                           
##  Mcnemar's Test P-Value : 2.437e-07       
##                                           
##             Sensitivity : 0.7654          
##             Specificity : 0.8784          
##          Pos Pred Value : 0.8610          
##          Neg Pred Value : 0.7919          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3796          
##    Detection Prevalence : 0.4409          
##       Balanced Accuracy : 0.8219          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))
## [1] "GLM Original Features - AUC: 0.8874"

Plot ROC curve

plot(roc_original,
  main = "ROC Curve - GLM Original Features",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

2.11 Residual analysis

  • Plot 1: Residuals vs Fitted (Linear Predictor) Look for a lack of strong patterns, random scatter around the smoothed line (though some banding is expected for binary data). Curvature might indicate link function issues or missing non-linear terms.
  • Plot 3: Residuals vs Leverage Look for points with high leverage (far right) AND large standardized residuals (far top/bottom). Points outside Cook’s distance contours (dashed lines) are influential.
plot(glm_original_model, which = c(1, 5))

Calculates Variance Inflation Factors. Important when using original predictors. High VIF (> 5 or > 10) suggests multicollinearity might be inflating standard errors of coefficients, making them unstable/unreliable.

vif_values <- vif(glm_original_model)
print("VIF Values:")
## [1] "VIF Values:"
print(vif_values)
##     koi_period   koi_duration      koi_depth       koi_prad        koi_teq 
##      45.365541       3.146117       2.405060       2.099924       5.875834 
##      koi_insol  koi_model_snr      koi_steff      koi_slogg       koi_srad 
##       1.658242       1.735790      15.228120       4.938804       2.532314 
##      koi_smass     koi_impact        koi_ror       koi_srho        koi_sma 
##       5.751579       2.101254       2.885427       1.112261      63.188275 
##       koi_incl        koi_dor koi_ldm_coeff1 koi_ldm_coeff2       koi_smet 
##       2.014036       4.972557     173.500591     113.851021       2.123956
# Check for high values
max_vif <- max(vif_values, na.rm = TRUE)
print(paste("Max VIF:", round(max_vif, 2)))
## [1] "Max VIF: 173.5"

From the VIF values, we notice the following: 1. Physical Relationships: koi_period and koi_sma are expected to be highly correlated due to Kepler’s Third Law. 2. Parameter Dependencies: Limb darkening coefficients (koi_ldm_coeff1, koi_ldm_coeff2) are often highly correlated with each other and can also be correlated with stellar temperature (koi_steff). Stellar parameters (koi_steff, koi_teq, koi_smass) are often correlated among themselves.

We proceed by removing highly correlated predictors from the model.

# Remove highly correlated predictors
predictors_to_remove <- c("koi_sma", "koi_ldm_coeff2")
train_data_scaled <- train_data_scaled[, !names(train_data_scaled) %in% predictors_to_remove]
test_data_scaled <- test_data_scaled[, !names(test_data_scaled) %in% predictors_to_remove]

# Update predictor_cols
predictor_cols <- predictor_cols[!predictor_cols %in% predictors_to_remove]

# Refit the model with updated data
glm_original_model <- glm(glm_original_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_original_model)
## 
## Call:
## glm(formula = glm_original_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.12710    0.08484  13.286  < 2e-16 ***
## koi_period     -0.02010    0.07338  -0.274  0.78415    
## koi_duration    1.26967    0.10099  12.573  < 2e-16 ***
## koi_depth       0.78093    0.34724   2.249  0.02452 *  
## koi_prad        0.01503    0.26629   0.056  0.95498    
## koi_teq         2.12831    0.10022  21.237  < 2e-16 ***
## koi_insol      -1.20622    0.12232  -9.861  < 2e-16 ***
## koi_model_snr  -0.07166    0.08178  -0.876  0.38091    
## koi_steff      -0.17450    0.08311  -2.100  0.03576 *  
## koi_slogg       0.79903    0.08851   9.028  < 2e-16 ***
## koi_srad        0.11529    0.07186   1.604  0.10861    
## koi_smass       0.46510    0.09684   4.803 1.57e-06 ***
## koi_impact      0.41540    0.10568   3.931 8.47e-05 ***
## koi_ror         5.69350    0.50833  11.200  < 2e-16 ***
## koi_srho        0.02612    0.03047   0.857  0.39119    
## koi_incl       -0.24948    0.10070  -2.477  0.01323 *  
## koi_dor         0.62047    0.07931   7.823 5.14e-15 ***
## koi_ldm_coeff1  0.18397    0.06020   3.056  0.00224 ** 
## koi_smet       -0.66341    0.04795 -13.836  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8866.2  on 6395  degrees of freedom
## Residual deviance: 5054.1  on 6377  degrees of freedom
## AIC: 5092.1
## 
## Number of Fisher Scoring iterations: 8

2.12 Updated interpretation

Overall Model Changes & Fit:

  • AIC: The AIC has decreased from 5241.5 in the previous model to 5092.1 in this model. A lower AIC generally suggests a better trade-off between model fit and complexity. This is an improvement.
  • Deviance: The residual deviance has also decreased (5054.1 vs. 5199.5), indicating a better fit to the data with this new model specification.
  • Predictors: koi_ldm_coeff2 is no longer present. This change in model structure is likely contributing to the improved AIC.

Coefficients Interpretation (Positive Class: “false_positive”):

  • (Intercept) 1.12710 (***): When all scaled predictor variables are at their mean value (0), the log-odds of the KOI being a “false_positive” is 1.12710. This is highly significant.

Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE”:

  • koi_duration (1.26967 ***): Longer transit durations are strongly associated with an increased likelihood of being a “false_positive”. Consistent with the previous model, slightly stronger effect.

  • koi_depth (0.78093 *): Deeper transits are associated with an increased likelihood of being a “false_positive”. Still positive, but the effect size and significance are notably reduced compared to the previous 2.19920 ***.

  • koi_teq (2.12831 ***): Higher equilibrium temperatures are very strongly associated with an increased likelihood of being a “false_positive”. Consistent, effect size increased from 1.82665 ***.

  • koi_slogg (0.79903 ***): Higher stellar surface gravity is very strongly associated with an increased likelihood of being a “false_positive”. Consistent, slightly stronger effect than 0.67690 ***.

  • koi_smass (0.46510 ***): Higher stellar mass is strongly associated with an increased likelihood of being a “false_positive”. Consistent, similar magnitude to 0.44922 ***.

  • koi_impact (0.41540 ***): A higher impact parameter (more grazing transit) is strongly associated with an increased likelihood of being a “false_positive”. Consistent, slightly smaller magnitude than 0.53968 ***.

  • koi_ror (5.69350 ***): A larger planet-to-star radius ratio is very strongly associated with an increased likelihood of being a “false_positive”. This effect has become much stronger—previously 2.55320 ***—highlighting koi_ror as an even more critical indicator of false positives.

  • koi_dor (0.62047 ***): Larger planet-star distance over star radius (at mid-transit) is very strongly associated with an increased likelihood of being a “false_positive”. Consistent, similar magnitude to 0.58369 ***.

  • koi_ldm_coeff1 (0.18397 **): This coefficient has flipped sign and changed significance. Previously -1.05599 *, now positive. Indicates that higher values of this coefficient are now associated with false positives, possibly due to the removal of koi_ldm_coeff2.

Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE”:

  • koi_insol (-1.20622 ***): Higher insolation flux is very strongly associated with a decreased likelihood of being a “false_positive”. Consistent, and effect size more than doubled from -0.44363 ***.

  • koi_steff (-0.17450 *): Higher stellar effective temperature is associated with a decreased likelihood of being a “false_positive”. Consistent, but effect size and significance reduced from -0.48874 **.

  • koi_incl (-0.24948 *): Higher inclination (closer to 90°, more edge-on) is associated with a decreased likelihood of being a “false_positive”. Consistent, but effect size and significance reduced from -0.41634 ***.

  • koi_smet (-0.66341 ***): Higher stellar metallicity is very strongly associated with a decreased likelihood of being a “false_positive”. Consistent, slightly stronger effect than -0.56487 ***.

2.13 Model Evaluation without correlated predictors

Predict on the test set and evaluate.

original_probs <- predict(glm_original_model, newdata = test_data_scaled, type = "response")
# Ensure probabilities are valid
original_probs <- pmax(pmin(original_probs, 1 - 1e-15), 1e-15)

# Convert probabilities to class predictions using a 0.5 threshold
threshold <- 0.5
# Use the levels from the *test* data target factor for creating prediction factor
test_target_levels <- levels(test_data_scaled[[target_variable_name]])
negative_class <- test_target_levels[1] # Should be 'candidate'
# Note: positive_class was defined earlier based on train_data levels

original_preds <- factor(ifelse(original_probs > threshold, positive_class, negative_class),
  levels = test_target_levels
)

test_target_factor <- test_data_scaled[[target_variable_name]]

cm <- confusionMatrix(original_preds,
  test_target_factor,
  positive = positive_class
) # Ensure positive class is correctly specified
cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            704            187
##   false_positive       102            606
##                                           
##                Accuracy : 0.8193          
##                  95% CI : (0.7995, 0.8378)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6382          
##                                           
##  Mcnemar's Test P-Value : 7.765e-07       
##                                           
##             Sensitivity : 0.7642          
##             Specificity : 0.8734          
##          Pos Pred Value : 0.8559          
##          Neg Pred Value : 0.7901          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3790          
##    Detection Prevalence : 0.4428          
##       Balanced Accuracy : 0.8188          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_original <- roc(
  response = test_target_factor,
  predictor = original_probs,
  levels = test_target_levels
) # Specify levels explicitly
print(paste("GLM Original Features - AUC:", round(auc(roc_original), 4)))
## [1] "GLM Original Features - AUC: 0.8875"

Plot ROC curve

plot(roc_original,
  main = "ROC Curve - GLM Original Features",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

vif_values <- vif(glm_original_model)
print("VIF Values:")
## [1] "VIF Values:"
print(vif_values)
##     koi_period   koi_duration      koi_depth       koi_prad        koi_teq 
##       5.352631       2.999511       2.404310       2.116069       4.028664 
##      koi_insol  koi_model_snr      koi_steff      koi_slogg       koi_srad 
##       1.570927       1.729637       4.067829       4.808042       2.472033 
##      koi_smass     koi_impact        koi_ror       koi_srho       koi_incl 
##       5.048217       2.095302       2.886955       1.112462       2.006146 
##        koi_dor koi_ldm_coeff1       koi_smet 
##       4.665995       2.771644       1.431394
# Check for high values
max_vif <- max(vif_values, na.rm = TRUE)
print(paste("Max VIF:", round(max_vif, 2)))
## [1] "Max VIF: 5.35"

Now the model should be good.

2.14 Summary of GLM Results

The logistic regression model was fitted using scaled original numeric features (excluding the binary flag variables) to predict the koi_pdisposition.

  1. Model Fit & Significant Predictors: The overall model shows a highly significant improvement over the null model (intercept only), indicated by the large drop in deviance (Null: 8866.2, Residual: 5045.5). Several predictors showed a statistically significant relationship (p < 0.05) with the target variable in this model: koi_duration, koi_depth, koi_teq, koi_insol, koi_steff, koi_slogg, koi_smass, koi_impact, koi_ror, koi_incl, koi_dor, koi_ldm_coeff1, koi_ldm_coeff2, and koi_smet. Notable predictors that were not statistically significant in this specific model include koi_period, koi_prad, koi_model_snr, koi_srad, koi_srho, and koi_sma. This could be due to collinearity with other included variables.

  2. Residual Analysis (Non-linearity Check): Tests for non-linearity (curvature) were performed on the model’s residuals. Significant evidence (p < 0.05) of non-linear patterns remaining in the residuals was found for the following predictors: koi_depth, koi_teq, koi_insol, koi_slogg, koi_srad, koi_smass, koi_impact, koi_sma, koi_incl, koi_dor, and koi_smet. koi_period was borderline significant. This indicates that the assumption of a purely linear relationship between these predictors and the log-odds of the outcome is likely violated.

  3. Conclusion: The GLM identifies many significant predictors for exoplanet disposition. However, the residual analysis strongly suggests that the model’s linear structure is insufficient to capture the underlying relationships for numerous variables. This indicates that model performance could potentially be improved by incorporating non-linear effects or using models better suited to capturing such complexities.

3 Model improvements

3.1 GAM model

Let’s start by fitting a GAM model in order to use smooth functions s() for predictors showing non-linearity in residual analysis.

nonlinear_predictors_gam <- c(
  "koi_depth", "koi_teq", "koi_insol", "koi_slogg",
  "koi_srad", "koi_smass", "koi_impact", "koi_sma",
  "koi_incl", "koi_dor", "koi_smet", "koi_period"
)

nonlinear_predictors_gam <- nonlinear_predictors_gam[nonlinear_predictors_gam %in% names(train_data_scaled)]

linear_predictors_gam <- setdiff(predictor_cols, nonlinear_predictors_gam)

Prepare the formula for the GAM model.

smooth_terms <- if (length(nonlinear_predictors_gam) > 0) paste(paste0("s(", nonlinear_predictors_gam, ")"), collapse = " + ") else NULL
linear_terms <- if (length(linear_predictors_gam) > 0) paste(linear_predictors_gam, collapse = " + ") else NULL

gam_formula_str <- paste(target_variable_name, "~", paste(c(smooth_terms, linear_terms), collapse = " + "))
# Remove potential trailing/leading "+" if one group was empty
gam_formula_str <- gsub("\\+ $", "", gsub("^\\+ ", "", gsub(" \\+ \\+ ", " + ", gam_formula_str)))

gam_formula <- as.formula(gam_formula_str)
print(paste("Using GAM formula:", gam_formula_str))
## [1] "Using GAM formula: koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) + s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) + s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) + koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror + koi_srho + koi_ldm_coeff1"

Fit the GAM model.

gam_model <- gam(gam_formula,
  data = train_data_scaled,
  family = binomial(link = "logit"),
  method = "REML"
)
summary(gam_model)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## koi_pdisposition ~ s(koi_depth) + s(koi_teq) + s(koi_insol) + 
##     s(koi_slogg) + s(koi_srad) + s(koi_smass) + s(koi_impact) + 
##     s(koi_incl) + s(koi_dor) + s(koi_smet) + s(koi_period) + 
##     koi_duration + koi_prad + koi_model_snr + koi_steff + koi_ror + 
##     koi_srho + koi_ldm_coeff1
## 
## Parametric coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     0.94561    0.11063   8.548  < 2e-16 ***
## koi_duration    1.73728    0.14344  12.112  < 2e-16 ***
## koi_prad       -0.28360    0.29322  -0.967  0.33345    
## koi_model_snr  -0.13978    0.08423  -1.659  0.09702 .  
## koi_steff       0.01356    0.17555   0.077  0.93843    
## koi_ror         6.64085    0.79541   8.349  < 2e-16 ***
## koi_srho       -0.00168    0.03094  -0.054  0.95669    
## koi_ldm_coeff1  0.20498    0.07090   2.891  0.00384 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq p-value    
## s(koi_depth)  2.721  3.404   6.681  0.0983 .  
## s(koi_teq)    5.173  6.144 309.050  <2e-16 ***
## s(koi_insol)  1.000  1.000   0.004  0.9485    
## s(koi_slogg)  4.254  5.476  53.962  <2e-16 ***
## s(koi_srad)   6.971  7.765   3.574  0.8206    
## s(koi_smass)  1.002  1.004   0.513  0.4749    
## s(koi_impact) 1.817  2.077  14.228  0.0305 *  
## s(koi_incl)   2.289  2.865   3.743  0.3121    
## s(koi_dor)    4.856  6.004  96.825  <2e-16 ***
## s(koi_smet)   5.843  6.826  97.286  <2e-16 ***
## s(koi_period) 2.696  3.438   4.490  0.2376    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.544   Deviance explained = 46.3%
## -REML = 2458.4  Scale est. = 1         n = 6396

Plot of the splines effect:

par(mfrow = c(2, 3))
for (i in 1:11) {
  plot(gam_model, select = i, shade = TRUE, shade.col = "lightblue")
  abline(h = 0, lty = "dashed")
}

As we can see from the plots, not all the splines are significant. For example koi_period, koi_insol, koi_smass, koi_period, koi_incl, koi_depth and koi_impact are not significant. We will try to remove them from the model.

gam_model <- update(gam_model, . ~ . - s(koi_period) - s(koi_insol) - s(koi_smass) - s(koi_period) - s(koi_incl) - s(koi_depth) - s(koi_impact))
summary(gam_model)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## koi_pdisposition ~ s(koi_teq) + s(koi_slogg) + s(koi_srad) + 
##     s(koi_dor) + s(koi_smet) + koi_duration + koi_prad + koi_model_snr + 
##     koi_steff + koi_ror + koi_srho + koi_ldm_coeff1
## 
## Parametric coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.00430    0.06785  14.801  < 2e-16 ***
## koi_duration    1.41230    0.09649  14.637  < 2e-16 ***
## koi_prad       -0.19688    0.26623  -0.739  0.45961    
## koi_model_snr  -0.04401    0.07035  -0.626  0.53163    
## koi_steff       0.23853    0.10635   2.243  0.02491 *  
## koi_ror         7.67484    0.42627  18.004  < 2e-16 ***
## koi_srho        0.02748    0.02981   0.922  0.35658    
## koi_ldm_coeff1  0.19630    0.07053   2.783  0.00538 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                edf Ref.df  Chi.sq p-value    
## s(koi_teq)   5.699  6.838 297.439  <2e-16 ***
## s(koi_slogg) 4.830  5.947  80.271  <2e-16 ***
## s(koi_srad)  6.249  7.163   6.793    0.49    
## s(koi_dor)   8.097  8.784 209.363  <2e-16 ***
## s(koi_smet)  5.997  6.962 148.411  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.539   Deviance explained = 45.7%
## -REML = 2483.8  Scale est. = 1         n = 6396

Parametric Coefficients (Linear Effects)

These coefficients are interpreted like those in a standard GLM: they represent the change in the log-odds of being a "false_positive" for a one-unit increase in the (scaled) predictor, holding other variables constant.

  • (Intercept) 1.00430 (***): The baseline log-odds of a KOI being a "false_positive" is 1.00430 when all linear predictors are zero and the smooth functions are at their average effect. This is highly significant.

  • koi_duration 1.41230 (***): Longer transit durations significantly increase the log-odds of being a "false_positive". (Odds Ratio ≈ e^1.41230 ≈ 4.10). This effect is consistent with and slightly stronger than in the previous GLM.

  • koi_prad -0.19688 (p=0.45961): Planetary radius is not a significant linear predictor of "false_positive" status in this model.

  • koi_model_snr -0.04401 (p=0.53163): The transit signal-to-noise ratio is not a significant linear predictor.

  • koi_steff 0.23853 (*): Higher stellar effective temperature (koi_steff) significantly increases the log-odds of being a "false_positive". (Odds Ratio ≈ e^0.23853 ≈ 1.27).

    Important Change: This is a flip in direction compared to the previous GLM where higher koi_steff decreased the odds of being a false positive. Now, in the context of this GAM (with other variables modeled non-linearly), higher stellar temperatures are associated with a higher likelihood of being a false positive. This suggests that the relationship is complex and was being influenced by the enforced linearity of other terms in the GLM.

  • koi_ror 7.67484 (***): The planet-to-star radius ratio (koi_ror) has a very large and highly significant positive effect on the log-odds of being a "false_positive". (Odds Ratio ≈ e^7.67484 ≈ 2153). This remains an extremely strong indicator, likely identifying eclipsing binaries. The effect size is even larger than in the GLM.

  • koi_srho 0.02748 (p=0.35658): Fitted stellar density is not a significant linear predictor.

  • koi_ldm_coeff1 0.19630 (**): The first limb darkening coefficient has a significant positive linear effect on the log-odds of being a "false_positive". (Odds Ratio ≈ e^0.19630 ≈ 1.22). This is consistent with the GLM.

Approximate Significance of Smooth Terms (Non-Linear Effects)

For these variables, the relationship with the log-odds of being a "false_positive" is not assumed to be linear. The edf (estimated degrees of freedom) indicates the complexity of the curve; an edf close to 1 suggests a near-linear relationship, while higher values indicate more “wiggles” or non-linearity.

  • s(koi_teq) edf=5.699 (***): Equilibrium temperature (koi_teq) has a highly significant and non-linear relationship with the likelihood of being a "false_positive". The edf of ~5.7 suggests a fairly complex curve. The linear GLM showed a simple positive association; this GAM indicates the true relationship is more nuanced across the range of koi_teq.
  • s(koi_slogg) edf=4.830 (**): Stellar surface gravity (koi_slogg) also has a highly significant and non-linear effect. The edf of ~4.8 indicates a complex relationship, not just a simple linear trend as was suggested by the GLM.
  • s(koi_srad) edf=6.249 (p=0.49): The smooth term for stellar radius (koi_srad) is not significant. Even when allowing for a non-linear relationship, koi_srad does not significantly help predict "false_positive" status in this model.
  • s(koi_dor) edf=8.097 (***): The planet-star distance over star radius (koi_dor) has a highly significant and very complex non-linear relationship (high edf of ~8.1). This indicates that specific ranges or patterns in koi_dor are associated with "false_positive" likelihood in a way that a straight line cannot capture.
  • s(koi_smet) edf=5.997 (***): Stellar metallicity (koi_smet) has a highly significant and non-linear effect. The edf of ~6 suggests a complex curve. The GLM showed a linear negative effect (higher metallicity → less likely FP). The GAM suggests this relationship may not be consistently linear across all metallicity values; for instance, the benefit might plateau or change at very low/high metallicities.

Summary and Key Insights from the GAM

  • Non-Linearity Matters: Several key predictors (koi_teq, koi_slogg, koi_dor, koi_smet) have significant non-linear relationships with the probability of a KOI being a "false_positive". This means their influence is more complex than a simple positive or negative trend. Visualizing these smooth terms is crucial.
  • koi_ror is King (for FPs): The planet-to-star radius ratio (koi_ror) remains an overwhelmingly strong linear predictor, with a massive coefficient, strongly indicating that large koi_ror values are characteristic of "false_positives" (likely eclipsing binaries).
  • Reversal of koi_steff Effect: The most notable change for linear terms is that koi_steff (stellar effective temperature) now increases the likelihood of a "false_positive". This highlights how accounting for non-linearities in other variables can change the apparent effect of those kept linear. It suggests the relationship between koi_steff and disposition is context-dependent.
  • Consistent Linear Effects: koi_duration and koi_ldm_coeff1 continue to show a positive linear association with "false_positive" likelihood.
  • Persistent Non-Significance: koi_prad, koi_model_snr, koi_srho (linear terms), and koi_srad (even as a smooth term) do not significantly contribute to the model.
  • Improved Model Fit: The GAM explains more deviance (45.7%) and has a higher pseudo R-squared (0.539) compared to the previous GLMs, suggesting it captures the underlying data patterns more effectively by allowing for these non-linearities.

Now we can see that all the splines are significant. Plot of the splines effect:

par(mfrow = c(2, 3))
for (i in 1:6) {
  plot(gam_model, select = i, shade = TRUE, shade.col = "lightblue")
  abline(h = 0, lty = "dashed")
}

Predict on the test set and evaluate.

gam_probs <- predict(gam_model, newdata = test_data_scaled, type = "response")
gam_probs <- pmax(pmin(gam_probs, 1 - 1e-15), 1e-15)
gam_preds <- factor(ifelse(gam_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

cm_gam <- confusionMatrix(gam_preds, test_target_factor, positive = positive_class)
cm_gam
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            695            165
##   false_positive       111            628
##                                          
##                Accuracy : 0.8274         
##                  95% CI : (0.808, 0.8456)
##     No Information Rate : 0.5041         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6546         
##                                          
##  Mcnemar's Test P-Value : 0.001422       
##                                          
##             Sensitivity : 0.7919         
##             Specificity : 0.8623         
##          Pos Pred Value : 0.8498         
##          Neg Pred Value : 0.8081         
##              Prevalence : 0.4959         
##          Detection Rate : 0.3927         
##    Detection Prevalence : 0.4622         
##       Balanced Accuracy : 0.8271         
##                                          
##        'Positive' Class : false_positive 
## 

Plot confusion matrix

cm_data <- as.data.frame(cm_gam$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_gam <- roc(response = test_target_factor, predictor = gam_probs, levels = levels(test_target_factor))
print(paste("GAM - AUC:", round(auc(roc_gam), 4)))
## [1] "GAM - AUC: 0.8937"

Plot ROC curve

plot(roc_gam,
  main = "ROC Curve - GAM",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

3.2 Interaction terms

Let’s try adding interaction terms to the original GLM model.

interaction_subset <- c("koi_teq", "koi_slogg", "koi_ror", "koi_smet", "koi_impact")
# Ensure these exist in the data
interaction_subset <- interaction_subset[interaction_subset %in% names(train_data_scaled)]

# Get all other predictors to include as main effects
all_predictors <- setdiff(names(train_data_scaled), target_variable_name)
other_predictors <- setdiff(all_predictors, interaction_subset)

Prepare the formula for the GLM model with interaction terms.

interaction_formula_str <- paste(
  target_variable_name, "~",
  paste0("(", paste(interaction_subset, collapse = " + "), ")^2"), # Pairwise interactions + main effects
  "+",
  paste(other_predictors, collapse = " + ")
) # Add main effects of others
interaction_formula <- as.formula(interaction_formula_str)
print(paste("Using interaction formula:", interaction_formula_str))
## [1] "Using interaction formula: koi_pdisposition ~ (koi_teq + koi_slogg + koi_ror + koi_smet + koi_impact)^2 + koi_period + koi_duration + koi_depth + koi_prad + koi_insol + koi_model_snr + koi_steff + koi_srad + koi_smass + koi_srho + koi_incl + koi_dor + koi_ldm_coeff1"

Fit the GLM model.

glm_interaction_model <- glm(interaction_formula,
  data = train_data_scaled,
  family = binomial(link = "logit")
)
summary(glm_interaction_model)
## 
## Call:
## glm(formula = interaction_formula, family = binomial(link = "logit"), 
##     data = train_data_scaled)
## 
## Coefficients:
##                        Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)           8.837e+14  9.182e+05  962444900   <2e-16 ***
## koi_teq               1.056e+15  1.991e+06  530666658   <2e-16 ***
## koi_slogg             4.326e+14  1.843e+06  234735460   <2e-16 ***
## koi_ror               1.465e+15  4.019e+06  364412212   <2e-16 ***
## koi_smet             -3.529e+14  1.036e+06 -340529745   <2e-16 ***
## koi_impact            9.351e+13  2.577e+06   36282262   <2e-16 ***
## koi_period            2.376e+14  1.521e+06  156148218   <2e-16 ***
## koi_duration          4.758e+14  1.162e+06  409363536   <2e-16 ***
## koi_depth             1.864e+14  1.361e+06  137001244   <2e-16 ***
## koi_prad              2.407e+14  2.072e+06  116192510   <2e-16 ***
## koi_insol             3.147e+13  2.523e+06   12472902   <2e-16 ***
## koi_model_snr        -4.602e+13  1.076e+06  -42785900   <2e-16 ***
## koi_steff            -1.351e+14  1.779e+06  -75962347   <2e-16 ***
## koi_srad              1.993e+14  1.613e+06  123600517   <2e-16 ***
## koi_smass             2.297e+14  1.987e+06  115628797   <2e-16 ***
## koi_srho              3.575e+13  9.337e+05   38291236   <2e-16 ***
## koi_incl              4.230e+13  1.600e+06   26439186   <2e-16 ***
## koi_dor               1.143e+14  1.539e+06   74218505   <2e-16 ***
## koi_ldm_coeff1        1.398e+14  1.364e+06  102506756   <2e-16 ***
## koi_teq:koi_slogg     1.687e+14  5.822e+05  289846990   <2e-16 ***
## koi_teq:koi_ror      -8.763e+14  2.987e+06 -293351502   <2e-16 ***
## koi_teq:koi_smet      2.522e+13  9.968e+05   25295452   <2e-16 ***
## koi_teq:koi_impact   -3.046e+14  2.613e+06 -116592077   <2e-16 ***
## koi_slogg:koi_ror    -3.951e+14  3.320e+06 -119029491   <2e-16 ***
## koi_slogg:koi_smet    1.152e+14  9.377e+05  122906001   <2e-16 ***
## koi_slogg:koi_impact -4.026e+13  2.231e+06  -18047890   <2e-16 ***
## koi_ror:koi_smet     -4.314e+13  1.748e+06  -24684677   <2e-16 ***
## koi_ror:koi_impact   -7.049e+13  1.159e+05 -608474447   <2e-16 ***
## koi_smet:koi_impact  -1.282e+14  1.778e+06  -72140653   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:   8866.2  on 6395  degrees of freedom
## Residual deviance: 106184.6  on 6367  degrees of freedom
## AIC: 106243
## 
## Number of Fisher Scoring iterations: 25

From the summary, we notice that this specific model output is not interpretable due to severe numerical problems, likely stemming from (quasi-)complete separation caused by the complexity of the interaction terms.

Predict on the test set and evaluate.

interaction_probs <- predict(glm_interaction_model, newdata = test_data_scaled, type = "response")
interaction_probs <- pmax(pmin(interaction_probs, 1 - 1e-15), 1e-15)
interaction_preds <- factor(ifelse(interaction_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

cm_interaction <- confusionMatrix(interaction_preds, test_target_factor, positive = positive_class)
cm_interaction
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            486             80
##   false_positive       320            713
##                                           
##                Accuracy : 0.7498          
##                  95% CI : (0.7279, 0.7709)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5009          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8991          
##             Specificity : 0.6030          
##          Pos Pred Value : 0.6902          
##          Neg Pred Value : 0.8587          
##              Prevalence : 0.4959          
##          Detection Rate : 0.4459          
##    Detection Prevalence : 0.6460          
##       Balanced Accuracy : 0.7510          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(cm_interaction$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_glm_interaction <- roc(response = test_target_factor, predictor = interaction_probs, levels = levels(test_target_factor))
print(paste("GLM with interaction - AUC:", round(auc(roc_glm_interaction), 4)))
## [1] "GLM with interaction - AUC: 0.751"

Plot ROC curve

plot(roc_glm_interaction,
  main = "ROC Curve - GLM with interaction",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

4 Lasso Regression

Preapare dataset and formula

train_formula_lasso <- as.formula(paste("~ .")) # Include all predictors
x_train <- model.matrix(train_formula_lasso, data = train_data_scaled[, all_predictors])[, -1] # Predictor matrix, remove intercept
y_train <- train_data_scaled[[target_variable_name]] # Target factor

4.1 Fit the model

set.seed(42) # for reproducibility of CV folds
lasso <- glmnet(x_train, y_train, family = "binomial", alpha = 1)
cv_lasso_fit <- cv.glmnet(x_train, y_train,
  family = "binomial",
  alpha = 1,
  type.measure = "auc", # Use AUC for CV evaluation
  nfolds = 10
)

4.2 Plot the results

Value of lambda selected via cross validation:

plot(cv_lasso_fit)

cv_lasso_fit$lambda.min
## [1] 0.01330885

Selected model in lasso path plot:

plot(lasso, xvar = "lambda")
abline(v = log(cv_lasso_fit$lambda.min), col = "red", lty = "dashed")

the Lasso estimated coefficients corresponding to the best lambda are:

lasso_coef <- coef(lasso,  s=cv_lasso_fit$lambda.min)
lasso_coef[lasso_coef[,1]!=0,]
##   (Intercept)    koi_period  koi_duration     koi_depth       koi_teq 
##     0.3658801     0.1994419     0.3712789     0.8213715     0.9106747 
## koi_model_snr    koi_impact       koi_ror      koi_incl       koi_dor 
##     0.1879024     0.3371364     0.9173861    -0.6193954     0.1320083 
##      koi_smet 
##    -0.4631438

4.3 LASSO Model Interpretation

The LASSO (Least Absolute Shrinkage and Selection Operator) model performs feature selection by shrinking some coefficients to zero. The non-zero coefficients below are those selected by LASSO as important predictors, using the lambda.min value from cross-validation.

Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE” (Positive Coefficients):

  • (Intercept) 0.3658801: This is the baseline log-odds of a KOI being a “FALSE POSITIVE” when all other predictor variables in the model are zero.
  • koi_ror (0.9173861): A larger planet-to-star radius ratio. This is a very strong indicator. A very large koi_ror often points towards an eclipsing binary system rather than a planet.
  • koi_teq (0.9106747): Higher equilibrium temperature. Objects with extremely high calculated T_eq might be misclassified astrophysical phenomena.
  • koi_depth (0.8213715): Deeper transits. Very deep events might be more likely to be stellar eclipsing binaries.
  • koi_duration (0.3712789): Longer transit durations. Some types of false positives (e.g., grazing eclipsing binaries) can produce long-duration events.
  • koi_impact (0.3371364): Higher impact parameter. This means the transit is more grazing, which is common in false positive scenarios.
  • koi_period (0.1994419): Longer orbital periods. This suggests a slight increase in the likelihood of being a false positive.
  • koi_model_snr (0.1879024): Higher model signal-to-noise ratio. While counterintuitive, extremely high SNR events might sometimes be non-planetary or indicative of issues that lead to a false positive classification.
  • koi_dor (0.1320083): Larger planet-star distance over star radius. This can indicate a more grazing transit geometry.

Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE” (i.e., INCREASED Likelihood of being a “CANDIDATE”) (Negative Coefficients):

  • koi_incl (-0.6193954): Higher orbital inclination. An inclination closer to 90 degrees (edge-on transit) is required for a true planetary transit, making it less likely to be a false positive.
  • koi_smet (-0.4631438): Higher stellar metallicity. Stars with higher metallicity are known to be more likely to host planets, making an object more likely to be a genuine candidate.

4.4 Evaluate the model

best_lambda <- cv_lasso_fit$lambda.min
x_test <- model.matrix(train_formula_lasso, data = test_data_scaled[, all_predictors])[, -1]

# Evaluate using the best lambda found by CV
lasso_probs <- predict(cv_lasso_fit, newx = x_test, s = best_lambda, type = "response")
lasso_probs <- as.vector(pmax(pmin(lasso_probs, 1 - 1e-15), 1e-15)) # Ensure stability
lasso_preds <- factor(ifelse(lasso_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

lasso_cm <- confusionMatrix(lasso_preds, test_target_factor, positive = positive_class)
lasso_cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            722            222
##   false_positive        84            571
##                                           
##                Accuracy : 0.8086          
##                  95% CI : (0.7885, 0.8276)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6167          
##                                           
##  Mcnemar's Test P-Value : 4.811e-15       
##                                           
##             Sensitivity : 0.7201          
##             Specificity : 0.8958          
##          Pos Pred Value : 0.8718          
##          Neg Pred Value : 0.7648          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3571          
##    Detection Prevalence : 0.4096          
##       Balanced Accuracy : 0.8079          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC curve

roc_lasso <- roc(response = test_target_factor, predictor = lasso_probs, levels = levels(test_target_factor))
print(paste("Lasso - AUC:", round(auc(roc_lasso), 4)))
## [1] "Lasso - AUC: 0.8816"

Plot ROC curve

plot(roc_lasso,
  main = "ROC Curve - Lasso",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

5 Ridge regression

5.1 Fit the model

set.seed(42) # for reproducibility of CV folds
ridge <- glmnet(x_train, y_train, family = "binomial", alpha = 0)
cv_ridge_fit <- cv.glmnet(x_train, y_train,
  family = "binomial",
  alpha = 0, # Set alpha to 0 for Ridge
  type.measure = "auc", # Use AUC for CV evaluation
  nfolds = 10
)

5.2 Plot the results

Value of lambda selected via cross validation:

plot(cv_ridge_fit)

cv_ridge_fit$lambda.min
## [1] 0.01800751

Selected model in ridge path plot:

plot(ridge, xvar = "lambda")
abline(v = log(cv_ridge_fit$lambda.min), col = "red", lty = "dashed")

the Lasso estimated coefficients corresponding to the best lambda are:

ridge_coef <- coef(ridge,  s=cv_ridge_fit$lambda.min)
ridge_coef[ridge_coef[,1]!=0,]
##    (Intercept)     koi_period   koi_duration      koi_depth       koi_prad 
##     0.39884348     0.19678576     0.46405878     0.67426574     0.44766663 
##        koi_teq      koi_insol  koi_model_snr      koi_steff      koi_slogg 
##     1.01258905    -0.35870599     0.35635727     0.02635554     0.22080297 
##       koi_srad      koi_smass     koi_impact        koi_ror       koi_srho 
##    -0.04054393     0.15144990     0.46024229     0.65050228     0.03576134 
##       koi_incl        koi_dor koi_ldm_coeff1       koi_smet 
##    -0.59836143     0.20858063     0.07115970    -0.52529285

5.3 Ridge Regression Model Interpretation

Ridge regression is another regularization technique that helps to prevent overfitting by shrinking the coefficient estimates towards zero. Unlike LASSO, Ridge typically does not shrink coefficients to be exactly zero, so it tends to keep all predictors in the model. The interpretation of the coefficients is similar to GLM and LASSO.

Variables Associated with an INCREASED Likelihood of being a “FALSE POSITIVE” (Positive Coefficients from the Ridge model):

  • (Intercept) 0.39884348: The baseline log-odds of being a “FALSE POSITIVE” when other predictors are zero.
  • koi_teq (1.01258905): Higher equilibrium temperature. This is the strongest positive predictor, suggesting objects with very high T_eq are much more likely to be false positives.
  • koi_depth (0.67426574): Deeper transits. A strong indicator for false positives.
  • koi_ror (0.65050228): Larger planet-to-star radius ratio. Also a strong indicator, often pointing to eclipsing binaries.
  • koi_duration (0.46405878): Longer transit durations.
  • koi_impact (0.46024229): Higher impact parameter (more grazing transits).
  • koi_prad (0.44766663): Larger planetary radius. While larger planets are easier to detect, very large apparent radii can be indicative of non-planetary objects.
  • koi_model_snr (0.35635727): Higher model signal-to-noise ratio.
  • koi_slogg (0.22080297): Higher stellar surface gravity.
  • koi_dor (0.20858063): Larger planet-star distance over star radius.
  • koi_period (0.19678576): Longer orbital periods.
  • koi_smass (0.15144990): Higher stellar mass.
  • koi_ldm_coeff1 (0.07115970): The first limb darkening coefficient. A small positive impact.
  • koi_srho (0.03576134): Higher stellar density. A very small positive impact.
  • koi_steff (0.02635554): Higher stellar effective temperature. A very small positive impact, which is interesting as in other models it sometimes points to “CANDIDATE”. This might be due to interactions or the nature of Ridge regularization.

Variables Associated with a DECREASED Likelihood of being a “FALSE POSITIVE” (i.e., INCREASED Likelihood of being a “CANDIDATE”) (Negative Coefficients from the Ridge model):

  • koi_incl (-0.59836143): Higher orbital inclination. This is the strongest negative predictor, meaning more edge-on transits strongly suggest a “CANDIDATE”.
  • koi_smet (-0.52529285): Higher stellar metallicity. Metal-rich stars are more likely to host planets.
  • koi_insol (-0.35870599): Higher insolation flux. Objects receiving more stellar flux are more likely to be candidates.
  • koi_srad (-0.04054393): Larger stellar radius. A very small negative impact.

5.4 Evaluate the model

best_lambda_ridge <- cv_ridge_fit$lambda.min
ridge_probs <- predict(cv_ridge_fit, newx = x_test, s = best_lambda_ridge, type = "response")
ridge_probs <- as.vector(pmax(pmin(ridge_probs, 1 - 1e-15), 1e-15)) # Ensure stability
ridge_preds <- factor(ifelse(ridge_probs > 0.5, positive_class, levels(test_target_factor)[1]), levels = levels(test_target_factor))

Confusion matrix

ridge_cm <- confusionMatrix(ridge_preds, test_target_factor, positive = positive_class)
ridge_cm
## Confusion Matrix and Statistics
## 
##                 Reference
## Prediction       candidate false_positive
##   candidate            717            214
##   false_positive        89            579
##                                           
##                Accuracy : 0.8105          
##                  95% CI : (0.7904, 0.8294)
##     No Information Rate : 0.5041          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6205          
##                                           
##  Mcnemar's Test P-Value : 1.051e-12       
##                                           
##             Sensitivity : 0.7301          
##             Specificity : 0.8896          
##          Pos Pred Value : 0.8668          
##          Neg Pred Value : 0.7701          
##              Prevalence : 0.4959          
##          Detection Rate : 0.3621          
##    Detection Prevalence : 0.4178          
##       Balanced Accuracy : 0.8099          
##                                           
##        'Positive' Class : false_positive  
## 

Plot confusion matrix

cm_data <- as.data.frame(lasso_cm$table)
# Create heatmap
ggplot(cm_data, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(colour = "white", linewidth = 0.5) + # Add lines between tiles
  geom_text(aes(label = Freq), color = "white", size = 6) + # Adjust text size
  # Use a color scheme less prone to issues for colorblindness if possible
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Frequency") +
  theme_minimal(base_size = 12) + # Set base font size
  labs(
    title = "Confusion Matrix Heatmap", # Corrected title
    x = "Actual Class",
    y = "Predicted Class"
  ) +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5), # Center title
    axis.text.x = element_text(angle = 0, vjust = 1, hjust = 0.5), # Adjust text angle if needed
    axis.title = element_text(size = 14),
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  )

ROC Curve

roc_ridge <- roc(response = test_target_factor, predictor = ridge_probs, levels = levels(test_target_factor))
print(paste("Ridge Regression - AUC:", round(auc(roc_ridge), 4)))
## [1] "Ridge Regression - AUC: 0.8828"

Plot ROC curve

plot(roc_ridge,
  main = "ROC Curve - Lasso",
  col = "blue",
  lwd = 2,
  print.auc = TRUE,
  print.auc.y = 0.75,
  print.auc.x = 0.75
)

6 Save model results for comparison

# Create data frame with model performance metrics
models_performance <- data.frame(
  Model = c("GLM", "GAM", "GLM with Interactions", "Lasso", "Ridge"),
  Accuracy = c(
    cm$overall["Accuracy"],
    cm_gam$overall["Accuracy"],
    cm_interaction$overall["Accuracy"],
    lasso_cm$overall["Accuracy"],
    ridge_cm$overall["Accuracy"]
  ),
  Sensitivity = c(
    cm$byClass["Sensitivity"],
    cm_gam$byClass["Sensitivity"],
    cm_interaction$byClass["Sensitivity"],
    lasso_cm$byClass["Sensitivity"],
    ridge_cm$byClass["Sensitivity"]
  ),
  Specificity = c(
    cm$byClass["Specificity"],
    cm_gam$byClass["Specificity"],
    cm_interaction$byClass["Specificity"],
    lasso_cm$byClass["Specificity"],
    ridge_cm$byClass["Specificity"]
  ),
  AUC = c(
    auc(roc_original),
    auc(roc_gam),
    auc(roc_glm_interaction),
    auc(roc_lasso),
    auc(roc_ridge)
  )
)

# Round numeric columns to 4 decimal places
models_performance[, 2:5] <- round(models_performance[, 2:5], 4)

# save to Rda
save(models_performance, file = "data/Rdas/models_performance.Rda")

# Display the results
models_performance